#!/usr/bin/env bash
# tsv2tsj -- Converts PostgreSQL TSV lines into TSJ (tab-separated JSON) lines
#
# $ tsv2tsj COLUMN_TYPE... <input.tsv >output.tsj
#
# The columns in the input TSV should have the same order as the given
# COLUMN_TYPEs.
#
# Currently, JSON object/array, and some primitive SQL data types such as
# numbers do not receive any transcoding. TEXT and other types are encoded into
# a JSON string.
#
# PostgreSQL array types (e.g., INT[]) are transcoded element by element into
# JSON arrays.
##
# Strict mode: stop on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# At least one COLUMN_TYPE must be given on the command line.
if (( $# == 0 )); then
    usage "$0" "Missing COLUMN_TYPE"
fi

# perlHandlerForSqlType SQL_TYPE
#
# Prints, on stdout, the name of the Perl handler to put in front of `shift`
# for transcoding one PGTSV field of the given SQL type. SQL_TYPE must already
# be upper-cased by the caller.
#
#   TYPE[]                 -> "json_array sub {<handler for TYPE> shift}, "
#   JSON and numeric types -> json_value
#   BOOLEAN / BOOL         -> json_boolean
#   anything else          -> json_string
#
# The json_* names are only emitted here as text; they are presumably defined
# in the companion "$0".pl library that is prepended to the generated script.
perlHandlerForSqlType() {
    # See: https://www.postgresql.org/docs/current/static/datatype.html
    local type=$1
    case $type in
        # PostgreSQL arrays: recurse on the element type (one "[]" suffix is
        # stripped per level, so TYPE[][] nests two json_array closures)
        *"[]")
            local elementType=${type%'[]'}
            local elementHandler
            elementHandler=$(perlHandlerForSqlType "$elementType")
            # the trailing ", " makes the closure the first argument of
            # json_array, followed by the `shift` the caller appends
            printf '%s\n' "json_array sub {$elementHandler shift}, "
            ;;

        # no need to quote JSON or numeric types, including their
        # PostgreSQL aliases (JSONB, DECIMAL(p,s), FLOAT4, FLOAT8)
        JSON|JSONB|\
        INTEGER|SMALLINT|INT2|INT|INT4|BIGINT|INT8|\
        DECIMAL|"DECIMAL("*")"|NUMERIC|"NUMERIC("*")"|\
        REAL|FLOAT4|"DOUBLE PRECISION"|FLOAT8|\
        FLOAT|"FLOAT("*")"|\
        SERIAL|SERIAL4|BIGSERIAL|SERIAL8|SMALLSERIAL|SERIAL2)
            echo json_value
            ;;

        # Booleans have a special encoding in PGTSV
        BOOLEAN|BOOL)
            echo json_boolean
            ;;

        # Otherwise, treat as JSON string (the named types are listed only
        # for documentation; the catch-all covers them)
        TEXT|VARCHAR|"VARCHAR("*")"|\
        "CHAR("*")"|\
        *)
            echo json_string
            ;;
    esac
}

# Compose the body of the generated Perl loop from the given column types: one
# `print HANDLER shift;` statement per column, in argument order, with a
# `print "\t";` statement between consecutive columns (none before the first).
plStmts=
plSeparator=
plIndent='        '
for ColumnType; do
    # type names are matched in upper case by perlHandlerForSqlType
    plHandler=$(perlHandlerForSqlType "$(tr a-z A-Z <<<"$ColumnType")")
    plStmts+="${plSeparator}"$'\n'"${plIndent}print ${plHandler} shift;"$'\n'"${plIndent}"
    plSeparator='print "\t";'
done

# Resolve the companion Perl library ("$0".pl) BEFORE changing directory: when
# this script is invoked through a relative path (e.g. ./tsv2tsj), "$0" no
# longer points at the script once the working directory is the temp dir.
case $0 in
    /*) plLibrary=$0.pl ;;
    *)  plLibrary=$PWD/$0.pl ;;
esac

# Work inside a private scratch directory that is removed when the shell exits
# (cd / first so the directory being deleted is not the current one).
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}"/tsv2tsj.XXXXXXX)
trap 'cd /; rm -rf -- "$tmpdir"' EXIT
cd "$tmpdir"

# generate code: the Perl library followed by a main loop that splits every
# input line on tabs into @ARGV, so that each top-level `shift` in $plStmts
# consumes the next field, and terminates every output record with a newline.
# NOTE(review): Perl's split without a LIMIT drops trailing empty fields, so
# trailing empty columns reach their handler as undef -- confirm the handlers
# in the library cope with that.
{
    cat "$plLibrary"
    printf '%s\n' '
    while ( <> ) {
        chomp;
        @ARGV = split "\t";
        '"$plStmts"'
        print "\n";
    }
    '
} >"tsv2tsj-$$.pl"
chmod +x "tsv2tsj-$$.pl"

# Run the generated transcoder on stdin/stdout.
# The number of worker processes comes from DEEPDIVE_USE_TSV2TSJ_NPROCS when
# set, otherwise from nproc (falling back to 1 where nproc is unavailable).
if [[ ${DEEPDIVE_USE_TSV2TSJ_NPROCS:=$(nproc 2>/dev/null || echo 1)} -gt 1 ]]; then
    # run in parallel (increases throughput 2-3x in with 8 cores but not that much)
    # mkprocs and mkmimo are external utilities; presumably mkprocs starts the
    # workers attached to the input-*/output-* endpoints it creates in the
    # current (temp) directory, and mkmimo shuttles lines between them.
    mkprocs -n "$DEEPDIVE_USE_TSV2TSJ_NPROCS" -I input-%s -O output-%s ./tsv2tsj-$$.pl
    # collect the workers' output in the background while stdin is fed to them
    mkmimo output-* \> /dev/stdout &
    outputPid=$!
    mkmimo /dev/stdin \> input-*
    # do not finish (and clean up the temp dir) before all output is flushed
    wait "$outputPid"
else
    # single threaded
    # Deliberately NOT exec'ed: exec would replace this shell, the EXIT trap
    # would never run, and the temp directory would be left behind.
    "./tsv2tsj-$$.pl"
fi
